Differential Expression and Functional Analysis of COVID-19 Patients based on Viral Load, Age and Sex

Author

Wiktoria Brandys, Maria Chmielorz

DGE

Metadata and raw counts

library(GEOquery)
Loading required package: Biobase
Loading required package: BiocGenerics
Loading required package: generics

Attaching package: 'generics'
The following objects are masked from 'package:base':

    as.difftime, as.factor, as.ordered, intersect, is.element, setdiff,
    setequal, union

Attaching package: 'BiocGenerics'
The following objects are masked from 'package:stats':

    IQR, mad, sd, var, xtabs
The following objects are masked from 'package:base':

    anyDuplicated, aperm, append, as.data.frame, basename, cbind,
    colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
    get, grep, grepl, is.unsorted, lapply, Map, mapply, match, mget,
    order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
    rbind, Reduce, rownames, sapply, saveRDS, table, tapply, unique,
    unsplit, which.max, which.min
Welcome to Bioconductor

    Vignettes contain introductory material; view with
    'browseVignettes()'. To cite Bioconductor, see
    'citation("Biobase")', and for packages 'citation("pkgname")'.
Setting options('download.file.method.GEOquery'='auto')
Setting options('GEOquery.inmemory.gpl'=FALSE)
library(dplyr)

Attaching package: 'dplyr'
The following object is masked from 'package:Biobase':

    combine
The following objects are masked from 'package:BiocGenerics':

    combine, intersect, setdiff, setequal, union
The following object is masked from 'package:generics':

    explain
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
data_whole <- getGEO("GSE152075", GSEMatrix = TRUE)
Found 1 file(s)
GSE152075_series_matrix.txt.gz
data <- data_whole[[1]]
metadata <- pData(data)

metadata2 <- metadata %>%
  transmute(
    TITLE = title,
    AGE   = as.numeric(gsub("[^0-9]", "", `age:ch1`)),
    GENDER = `gender:ch1`,
    POS    = `sars-cov-2 positivity:ch1`,
    STATE  = as.numeric(`n1_ct:ch1`)
  ) %>%
  filter(!is.na(AGE), !is.na(STATE), GENDER %in% c("M","F"), POS == "pos") %>%
  mutate(
    GENDER = recode(GENDER, "M"="Male", "F"="Female"),
    AGE_GR = if_else(AGE > 60, ">60", "<=60"),
    STATE_GR = case_when(
      STATE < 19  ~ "high",
      STATE <= 24 ~ "medium",
      STATE > 24  ~ "low"
    ),
    STATE_GR = factor(STATE_GR, levels=c("low","medium","high")),
    GENDER   = factor(GENDER, levels=c("Female","Male")),
    AGE_GR   = factor(AGE_GR, levels=c("<=60",">60")),
    POS      = factor(POS)
  )
Warning: There was 1 warning in `transmute()`.
ℹ In argument: `STATE = as.numeric(`n1_ct:ch1`)`.
Caused by warning:
! NAs introduced by coercion
head(metadata2)
TITLE AGE GENDER POS STATE AGE_GR STATE_GR
GSM4602241 POS_001 64 Male pos 18.88 >60 high
GSM4602242 POS_002 30 Female pos 21.18 <=60 medium
GSM4602243 POS_003 47 Male pos 24.24 <=60 low
GSM4602244 POS_004 67 Female pos 18.91 >60 high
GSM4602245 POS_005 62 Male pos 25.62 >60 low
GSM4602246 POS_006 52 Female pos 25.61 <=60 low
summary(metadata2$AGE_GR)
<=60  >60 
 222  155 
summary(metadata2$GENDER)
Female   Male 
   201    176 
summary(metadata2$STATE_GR)
   low medium   high 
    94    190     93 
raw_counts <- read.table("GSE152075_raw_counts_GEO.txt", header = TRUE, sep=" ") %>%
  .[,metadata2$TITLE]
stopifnot(all(colnames(raw_counts) == metadata2$TITLE))
dim(raw_counts)
[1] 35784   377
dim(metadata2)
[1] 377   7

DESeq model

library(DESeq2)
Loading required package: S4Vectors
Loading required package: stats4

Attaching package: 'S4Vectors'
The following objects are masked from 'package:dplyr':

    first, rename
The following object is masked from 'package:utils':

    findMatches
The following objects are masked from 'package:base':

    expand.grid, I, unname
Loading required package: IRanges

Attaching package: 'IRanges'
The following objects are masked from 'package:dplyr':

    collapse, desc, slice
Loading required package: GenomicRanges
Loading required package: Seqinfo
Loading required package: SummarizedExperiment
Loading required package: MatrixGenerics
Loading required package: matrixStats

Attaching package: 'matrixStats'
The following object is masked from 'package:dplyr':

    count
The following objects are masked from 'package:Biobase':

    anyMissing, rowMedians

Attaching package: 'MatrixGenerics'
The following objects are masked from 'package:matrixStats':

    colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
    colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
    colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
    colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
    colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
    colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
    colWeightedMeans, colWeightedMedians, colWeightedSds,
    colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
    rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
    rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
    rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
    rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
    rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
    rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
    rowWeightedSds, rowWeightedVars
The following object is masked from 'package:Biobase':

    rowMedians
metadata2_df<-as.data.frame(metadata2)
rownames(metadata2_df)<-metadata2$TITLE

dds <- DESeqDataSetFromMatrix(countData = raw_counts, colData = metadata2_df, design= ~GENDER + AGE_GR + STATE_GR) #loading data
  Note: levels of factors in the design contain characters other than
  letters, numbers, '_' and '.'. It is recommended (but not required) to use
  only letters, numbers, and delimiters '_' or '.', as these are safe characters
  for column names in R. [This is a message, not a warning or an error]
dds <- DESeq(dds) #estimating factors
estimating size factors
  Note: levels of factors in the design contain characters other than
  letters, numbers, '_' and '.'. It is recommended (but not required) to use
  only letters, numbers, and delimiters '_' or '.', as these are safe characters
  for column names in R. [This is a message, not a warning or an error]
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
  Note: levels of factors in the design contain characters other than
  letters, numbers, '_' and '.'. It is recommended (but not required) to use
  only letters, numbers, and delimiters '_' or '.', as these are safe characters
  for column names in R. [This is a message, not a warning or an error]
final dispersion estimates
fitting model and testing
  Note: levels of factors in the design contain characters other than
  letters, numbers, '_' and '.'. It is recommended (but not required) to use
  only letters, numbers, and delimiters '_' or '.', as these are safe characters
  for column names in R. [This is a message, not a warning or an error]
-- replacing outliers and refitting for 6495 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)
estimating dispersions
fitting model and testing
  Note: levels of factors in the design contain characters other than
  letters, numbers, '_' and '.'. It is recommended (but not required) to use
  only letters, numbers, and delimiters '_' or '.', as these are safe characters
  for column names in R. [This is a message, not a warning or an error]
plotDispEsts(dds)

PCA

norm_counts <- DESeq2::counts(dds, normalized = T)
vsd <- vst(dds, blind = FALSE)
pca <- prcomp(t(assay(vsd)))

variance_explained <- pca$sdev^2 # squared standard deviation for each PC axis (first PC always explains the most of variance)
percentage_variance_explained <- variance_explained/sum(variance_explained)*100 #calculate percent of variance explained for each PC axis
percentage_variance_explained <- round(percentage_variance_explained,2) # round it to two decimal places

pca_plot <- #table used for plotting
  as.data.frame(pca$x) %>% # extract the "x" object from the "pca" list and convert it to a data.frame object - it consists of coordinates of each sample on all PC axes
  tibble::rownames_to_column(var ="TITLE") %>% # create a new column called "SAMPLE" out of row names (sample ids)
  left_join(metadata2_df,by = "TITLE") %>%  #combine our table with metadata - the rows will be combined based on value in the "SAMPLE" columns
  mutate(group=interaction(GENDER, AGE_GR, STATE_GR))

library(ggplot2)
ggplot(pca_plot, aes(x=PC1,y=PC2))+
  geom_point(aes(col = STATE_GR,shape=GENDER ),size = 3, alpha=0.85)+ facet_wrap(~ AGE_GR) +
  labs(x = percentage_variance_explained[1],y = percentage_variance_explained[2], title="STATE_GR and GENDER")

DEA contrasts

library(tibble)
dds_results_state <- results(dds, contrast = c("STATE_GR","high","low"), alpha = 0.01) %>%
  as.data.frame() %>%
  rownames_to_column("gene")
head(dds_results_state,10)
gene baseMean log2FoldChange lfcSE stat pvalue padj
A1BG 8.9233808 -0.9439156 0.4389110 -2.1505853 0.0315089 0.2437459
A1CF 2.3599496 -0.5595678 0.6269427 -0.8925343 0.3721066 0.7167321
A2M 50.3391117 -0.1046285 0.2629664 -0.3978778 0.6907203 0.8920282
A2ML1 5.4165568 -0.2866131 0.4711100 -0.6083783 0.5429366 0.8202468
A2MP1 0.8180962 0.4046829 0.9024968 0.4484037 0.6538619 NA
A3GALT2 0.1040169 -0.0512267 2.8986608 -0.0176726 0.9859001 NA
A4GALT 5.0191251 -0.5423609 0.4235773 -1.2804295 0.2003941 0.5542282
A4GNT 0.0921835 0.4878577 3.3328973 0.1463765 0.8836242 NA
AAAS 4.2728853 0.3460723 0.3470485 0.9971874 0.3186736 0.6736978
AACS 20.0455592 -0.7864801 0.3107959 -2.5305355 0.0113889 0.1473713
#nrow(dds_results)

Volcano plot

library(ggrepel)
volc <- dds_results_state %>%
  filter(!is.na(padj), !is.na(log2FoldChange)) %>%
  mutate(
    direction = case_when(
      padj < 0.01 & log2FoldChange >=  2 ~ "Up",
      padj < 0.01 & log2FoldChange <= -2 ~ "Down",
      T ~ "Non significant"
    )
  )
top_n <- 10
labels_df <- bind_rows(
  volc %>% filter(direction == "Up")   %>% arrange(padj) %>% slice_head(n = top_n),
  volc %>% filter(direction == "Down") %>% arrange(padj) %>% slice_head(n = top_n)
)

ggplot(volc)+
  geom_point(aes(x=log2FoldChange, y=-log10(padj), color=direction))+
  theme_bw()+
  geom_vline(xintercept = c(-2,2),linetype = "dotted")+
  geom_hline(yintercept = -log10(0.01), linetype = "dotted") +
  scale_color_manual(values = c("Up"="red", "Down"="blue"))+
  ggrepel::geom_text_repel(aes(x=log2FoldChange, y=-log10(padj), label = gene),
    data = labels_df,
    size = 3,
    max.overlaps = Inf
  ) +
  labs(
    title = "STATE_GR high vs low")

DEA contrasts age

dds_results_age <- results(dds, contrast = c("AGE_GR",">60","<=60"), alpha = 0.01)
dds_results_age <- as.data.frame(dds_results_age)

DATA

library(tibble)
down_expressed <- dds_results_state%>% filter(padj < 0.01 & log2FoldChange <= -2)
#View(down_expressed)
up_expressed<-dds_results_state %>% filter(padj < 0.01 & log2FoldChange >= 2)
#View(up_expressed)
genes_down<- down_expressed$gene
genes_down
[1] "ASPM"    "C3orf70" "CNR2"    "EEPD1"   "HERC2P8" "IGHA1"   "PRDM8"  
[8] "PSD"     "SEZ6"   
genes_up<-up_expressed$gene
genes_up
 [1] "AC005515.1" "ACE2"       "C4B"        "CCL2"       "CCL8"      
 [6] "CXCL10"     "CXCL11"     "CXCL13"     "HESX1"      "MMP13"     
[11] "NEXN"       "PARP16"     "SLC38A5"   

Biomart

library(biomaRt)
listEnsembl()
biomart version
genes Ensembl Genes 115
mouse_strains Mouse strains 115
snps Ensembl Variation 115
regulation Ensembl Regulation 115
ensembl<-useEnsembl(biomart="genes")
datasets_list <- listDatasets(ensembl) # display all available datasets
dataset <- useEnsembl(biomart = "genes",dataset = "hsapiens_gene_ensembl") # save the chosen dataset to a variable
filters <- listFilters(dataset)
pages <- attributePages(dataset)
feature_attributes <- listAttributes(mart = dataset, page = "feature_page")
down_genes <- getBM(attributes = c("hgnc_symbol","ensembl_gene_id", "chromosome_name", "uniprotswissprot","description"),
                     filters = c("hgnc_symbol","with_uniprotswissprot"),
                     values = list(genes_down,TRUE),
                     mart = dataset)
down_genes
hgnc_symbol ensembl_gene_id chromosome_name uniprotswissprot description
IGHA1 ENSG00000282633 HSCHR14_3_CTG1 P01876 immunoglobulin heavy constant alpha 1 [Source:HGNC Symbol;Acc:HGNC:5478]
IGHA1 ENSG00000211895 14 P01876 immunoglobulin heavy constant alpha 1 [Source:HGNC Symbol;Acc:HGNC:5478]
C3orf70 ENSG00000187068 3 A6NLC5 chromosome 3 open reading frame 70 [Source:HGNC Symbol;Acc:HGNC:33731]
CNR2 ENSG00000188822 1 P34972 cannabinoid receptor 2 [Source:HGNC Symbol;Acc:HGNC:2160]
PSD ENSG00000059915 10 A5PKW4 pleckstrin and Sec7 domain containing [Source:HGNC Symbol;Acc:HGNC:9507]
SEZ6 ENSG00000063015 17 Q53EL9 seizure related 6 homolog [Source:HGNC Symbol;Acc:HGNC:15955]
ASPM ENSG00000066279 1 Q8IZT6 assembly factor for spindle microtubules [Source:HGNC Symbol;Acc:HGNC:19048]
PRDM8 ENSG00000152784 4 Q9NQV8 PR/SET domain 8 [Source:HGNC Symbol;Acc:HGNC:13993]
EEPD1 ENSG00000122547 7 Q7L9B9 endonuclease/exonuclease/phosphatase family domain containing 1 [Source:HGNC Symbol;Acc:HGNC:22223]
up_genes <- getBM(attributes = c("hgnc_symbol","ensembl_gene_id", "chromosome_name", "uniprotswissprot","description"),
                     filters = c("hgnc_symbol","with_uniprotswissprot"),
                     values = list(genes_up,TRUE),
                     mart = dataset)
up_genes
hgnc_symbol ensembl_gene_id chromosome_name uniprotswissprot description
CXCL10 ENSG00000169245 4 P02778 C-X-C motif chemokine ligand 10 [Source:HGNC Symbol;Acc:HGNC:10637]
CXCL11 ENSG00000169248 4 O14625 C-X-C motif chemokine ligand 11 [Source:HGNC Symbol;Acc:HGNC:10638]
CXCL13 ENSG00000156234 4 O43927 C-X-C motif chemokine ligand 13 [Source:HGNC Symbol;Acc:HGNC:10639]
C4B ENSG00000236625 HSCHR6_MHC_DBB_CTG1 P0C0L5 complement C4B (Chido/Rodgers blood group) [Source:HGNC Symbol;Acc:HGNC:1324]
C4B ENSG00000228454 HSCHR6_MHC_SSTO_CTG1 P0C0L5 complement C4B (Chido/Rodgers blood group) [Source:HGNC Symbol;Acc:HGNC:1324]
C4B ENSG00000228267 HSCHR6_MHC_COX_CTG1 P0C0L5 complement C4B (Chido/Rodgers blood group) [Source:HGNC Symbol;Acc:HGNC:1324]
MMP13 ENSG00000137745 11 P45452 matrix metallopeptidase 13 [Source:HGNC Symbol;Acc:HGNC:7159]
CCL8 ENSG00000108700 17 P80075 C-C motif chemokine ligand 8 [Source:HGNC Symbol;Acc:HGNC:10635]
CCL2 ENSG00000108691 17 P13500 C-C motif chemokine ligand 2 [Source:HGNC Symbol;Acc:HGNC:10618]
HESX1 ENSG00000163666 3 Q9UBX0 HESX homeobox 1 [Source:HGNC Symbol;Acc:HGNC:4877]
ACE2 ENSG00000130234 X Q9BYF1 angiotensin converting enzyme 2 [Source:HGNC Symbol;Acc:HGNC:13557]
SLC38A5 ENSG00000017483 X Q8WUX1 solute carrier family 38 member 5 [Source:HGNC Symbol;Acc:HGNC:18070]
C4B ENSG00000224389 6 P0C0L5 complement C4B (Chido/Rodgers blood group) [Source:HGNC Symbol;Acc:HGNC:1324]
PARP16 ENSG00000138617 15 Q8N5Y8 poly(ADP-ribose) polymerase family member 16 [Source:HGNC Symbol;Acc:HGNC:26040]
NEXN ENSG00000162614 1 Q0ZGT2 nexilin F-actin binding protein [Source:HGNC Symbol;Acc:HGNC:29557]
library(dplyr)
library(tibble)
dds_age <- DESeqDataSetFromMatrix(
  countData = raw_counts,
  colData   = metadata2_df,
  design    = ~ GENDER + AGE_GR + STATE_GR + AGE_GR:STATE_GR
)
  Note: levels of factors in the design contain characters other than
  letters, numbers, '_' and '.'. It is recommended (but not required) to use
  only letters, numbers, and delimiters '_' or '.', as these are safe characters
  for column names in R. [This is a message, not a warning or an error]
dds_age <- dds_age[rowSums(counts(dds_age)) >= 10, ]

dds_age <- DESeq(
  dds_age,
  test = "LRT",
  reduced = ~ GENDER + AGE_GR + STATE_GR
)
estimating size factors
  Note: levels of factors in the design contain characters other than
  letters, numbers, '_' and '.'. It is recommended (but not required) to use
  only letters, numbers, and delimiters '_' or '.', as these are safe characters
  for column names in R. [This is a message, not a warning or an error]
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
  Note: levels of factors in the design contain characters other than
  letters, numbers, '_' and '.'. It is recommended (but not required) to use
  only letters, numbers, and delimiters '_' or '.', as these are safe characters
  for column names in R. [This is a message, not a warning or an error]
final dispersion estimates
fitting model and testing
  Note: levels of factors in the design contain characters other than
  letters, numbers, '_' and '.'. It is recommended (but not required) to use
  only letters, numbers, and delimiters '_' or '.', as these are safe characters
  for column names in R. [This is a message, not a warning or an error]
-- replacing outliers and refitting for 5934 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)
estimating dispersions
fitting model and testing
  Note: levels of factors in the design contain characters other than
  letters, numbers, '_' and '.'. It is recommended (but not required) to use
  only letters, numbers, and delimiters '_' or '.', as these are safe characters
  for column names in R. [This is a message, not a warning or an error]
res_age <- results(dds_age, alpha = 0.01) %>%
  as.data.frame() %>%
  rownames_to_column("gene") %>% arrange(padj)

head(res_age)
gene baseMean log2FoldChange lfcSE stat pvalue padj
ANP32B 49.58620 2.126836 0.4225584 30.29606 3.00e-07 0.000927
ACSL1 171.36365 -2.477159 0.6266276 22.17499 1.53e-05 0.009240
LRRK2 81.60135 -2.878249 0.7110629 22.11394 1.58e-05 0.009240
NAMPTP1 93.43408 -2.525624 0.7911499 22.36804 1.39e-05 0.009240
NFIL3 50.02852 -3.107534 0.6916379 22.90567 1.06e-05 0.009240
PLIN2 56.30717 1.417515 0.7299251 22.49070 1.31e-05 0.009240
dds_gender <- DESeqDataSetFromMatrix(
  countData = raw_counts,
  colData   = metadata2_df,
  design    = ~ GENDER + AGE_GR + STATE_GR + GENDER:STATE_GR
)
  Note: levels of factors in the design contain characters other than
  letters, numbers, '_' and '.'. It is recommended (but not required) to use
  only letters, numbers, and delimiters '_' or '.', as these are safe characters
  for column names in R. [This is a message, not a warning or an error]
dds_gender <- dds_gender[rowSums(counts(dds_age)) >= 10, ]

dds_gender <- DESeq(
  dds_gender,
  test = "LRT",
  reduced = ~ GENDER + AGE_GR + STATE_GR
)
estimating size factors
  Note: levels of factors in the design contain characters other than
  letters, numbers, '_' and '.'. It is recommended (but not required) to use
  only letters, numbers, and delimiters '_' or '.', as these are safe characters
  for column names in R. [This is a message, not a warning or an error]
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
  Note: levels of factors in the design contain characters other than
  letters, numbers, '_' and '.'. It is recommended (but not required) to use
  only letters, numbers, and delimiters '_' or '.', as these are safe characters
  for column names in R. [This is a message, not a warning or an error]
final dispersion estimates
fitting model and testing
  Note: levels of factors in the design contain characters other than
  letters, numbers, '_' and '.'. It is recommended (but not required) to use
  only letters, numbers, and delimiters '_' or '.', as these are safe characters
  for column names in R. [This is a message, not a warning or an error]
-- replacing outliers and refitting for 5972 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)
estimating dispersions
fitting model and testing
  Note: levels of factors in the design contain characters other than
  letters, numbers, '_' and '.'. It is recommended (but not required) to use
  only letters, numbers, and delimiters '_' or '.', as these are safe characters
  for column names in R. [This is a message, not a warning or an error]
res_gender <- results(dds_gender, alpha = 0.01) %>%
  as.data.frame() %>%
  rownames_to_column("gene") %>% arrange(padj)

head(res_gender,10)
gene baseMean log2FoldChange lfcSE stat pvalue padj
MTND5P10 3.498339 0.9536294 1.0411964 28.56316 6.00e-07 0.0097975
RAB30 10.801716 3.3981828 0.6857195 28.65906 6.00e-07 0.0097975
IVNS1ABP 274.560917 -0.1252271 0.5033068 25.73967 2.60e-06 0.0268003
CD3G 10.536983 3.0626171 0.6468687 23.64417 7.30e-06 0.0573101
ZNF416 4.132482 4.2758877 0.8978393 22.41878 1.35e-05 0.0846080
LEF1 3.991028 3.5394282 0.8667679 21.74075 1.90e-05 0.0908664
MTCO1P40 2.112396 3.0874988 0.8453723 21.36742 2.29e-05 0.0908664
TRMT5 11.746925 2.1406203 0.5919983 21.33605 2.33e-05 0.0908664
FGD4 68.726311 -0.1426466 0.5018868 21.08797 2.64e-05 0.0914366
PTPRE 53.886191 -1.4108065 0.6054225 20.65432 3.27e-05 0.0929259
head(res_gender)
gene baseMean log2FoldChange lfcSE stat pvalue padj
MTND5P10 3.498339 0.9536294 1.0411964 28.56316 6.00e-07 0.0097975
RAB30 10.801716 3.3981828 0.6857195 28.65906 6.00e-07 0.0097975
IVNS1ABP 274.560917 -0.1252271 0.5033068 25.73967 2.60e-06 0.0268003
CD3G 10.536983 3.0626171 0.6468687 23.64417 7.30e-06 0.0573101
ZNF416 4.132482 4.2758877 0.8978393 22.41878 1.35e-05 0.0846080
LEF1 3.991028 3.5394282 0.8667679 21.74075 1.90e-05 0.0908664

Plots age

library(ggplot2)
library(dplyr)

top_age_genes <- res_age %>% 
  filter(!is.na(padj)) %>%
  slice_min(padj, n=3) %>%
  pull(gene)

for(g in top_age_genes){
  df <- plotCounts(dds_age, gene = g, intgroup = c("STATE_GR","AGE_GR"),
                 normalized = TRUE, returnData = TRUE)

  plot<-ggplot(df, aes(x = STATE_GR, y = count, color = AGE_GR)) +
    geom_jitter(width = 0.15, alpha = 0.6, size = 2) +
    stat_summary(aes(group = AGE_GR), fun = median, geom = "line") +
    stat_summary(fun = median, geom = "point", size = 3) +
    scale_y_log10() +
    theme_classic() +
    labs(title = g, y = "normalized count (log10)", x = "STATE_GR")
  print(plot)
}

Plots gender

library(ggplot2)
library(dplyr)

top_gender_genes <- res_gender %>% 
  filter(!is.na(padj)) %>%
  slice_min(padj, n=3) %>%
  pull(gene)

for(g in top_gender_genes){
  df <- plotCounts(dds_gender, gene = g, intgroup = c("STATE_GR","GENDER"),
                 normalized = TRUE, returnData = TRUE)

  plot<-ggplot(df, aes(x = STATE_GR, y = count, color = GENDER)) +
    geom_jitter(width = 0.15, alpha = 0.6, size = 2) +
    stat_summary(aes(group = GENDER), fun = median, geom = "line") +
    stat_summary(fun = median, geom = "point", size = 3) +
    scale_y_log10() +
    theme_bw() +
    labs(title = g, y = "normalized count (log10)", x = "STATE_GR")
  print(plot)
}

Functional analysis

STATE

gene_universe <- dds_results_state$gene

Genes of interest

Upregulated

filtered_genes <- dds_results_state %>%
  filter(padj < 0.01 & log2FoldChange > 2) %>%
  pull(gene)

Downregulated

filtered_genes_down <- dds_results_state %>%
  filter(padj < 0.01 & log2FoldChange < -2) %>%
  pull(gene)

length(filtered_genes_down)
[1] 9

Over-representation analysis

library(GO.db)
Loading required package: AnnotationDbi

Attaching package: 'AnnotationDbi'
The following object is masked from 'package:dplyr':

    select
library(clusterProfiler)
clusterProfiler v4.18.4 Learn more at https://yulab-smu.top/contribution-knowledge-mining/

Please cite:

G Yu. Thirteen years of clusterProfiler. The Innovation. 2024,
5(6):100722

Attaching package: 'clusterProfiler'
The following object is masked from 'package:AnnotationDbi':

    select
The following object is masked from 'package:biomaRt':

    select
The following object is masked from 'package:IRanges':

    slice
The following object is masked from 'package:S4Vectors':

    rename
The following object is masked from 'package:stats':

    filter
library(org.Hs.eg.db)
library(enrichplot)
enrichplot v1.30.4 Learn more at https://yulab-smu.top/contribution-knowledge-mining/

Please cite:

Guangchuang Yu, Li-Gen Wang, Yanyan Han and Qing-Yu He.
clusterProfiler: an R package for comparing biological themes among
gene clusters. OMICS: A Journal of Integrative Biology. 2012,
16(5):284-287
GO_terms <- enrichGO(
  gene          = filtered_genes,      # genes of interest
  universe      = gene_universe,       # all detected genes
  OrgDb         = "org.Hs.eg.db",
  pAdjustMethod = "BH",
  pvalueCutoff  = 0.01,
  qvalueCutoff  = 0.01,
  readable      = TRUE,
  keyType       = "SYMBOL",
  ont           = "BP"
)

GO_terms_table <- as.data.frame(GO_terms)
head(GO_terms_table, 3)
ID Description GeneRatio BgRatio RichFactor FoldEnrichment zScore pvalue p.adjust qvalue geneID Count
GO:0070098 GO:0070098 chemokine-mediated signaling pathway 5/12 85/17277 0.0588235 84.69118 20.39177 0 5e-07 2e-07 CCL2/CCL8/CXCL10/CXCL11/CXCL13 5
GO:1990868 GO:1990868 response to chemokine 5/12 94/17277 0.0531915 76.58245 19.37155 0 5e-07 2e-07 CCL2/CCL8/CXCL10/CXCL11/CXCL13 5
GO:1990869 GO:1990869 cellular response to chemokine 5/12 94/17277 0.0531915 76.58245 19.37155 0 5e-07 2e-07 CCL2/CCL8/CXCL10/CXCL11/CXCL13 5
library(ggplot2)
library(stringr)

p5 <- dotplot(GO_terms, showCategory = 10, orderBy = "p.adjust")+
  theme(
    axis.text.y = element_text(size = 10),
    axis.text.x = element_text(size = 10),
    panel.grid.major.y = element_blank(),
    plot.title = element_text(hjust = 0.5, face = "bold")
  ) +
  labs(
    title = "GO BP enrichment – STATE_GR (upregulated genes)",
    x = "Gene ratio",
    y = NULL
  )
GO_terms_down <- enrichGO(
  gene          = filtered_genes_down,
  universe      = gene_universe,
  OrgDb         = "org.Hs.eg.db",
  pAdjustMethod = "BH",
  pvalueCutoff  = 0.01,
  qvalueCutoff  = 0.01,
  readable      = TRUE,
  keyType       = "SYMBOL",
  ont           = "BP"
)

GO_terms_down_table <- as.data.frame(GO_terms_down)
head(GO_terms_down_table, 3)
ID Description GeneRatio BgRatio RichFactor FoldEnrichment zScore pvalue p.adjust qvalue geneID Count

Blank table.

AGE

library(tibble)

dds_results_age <- dds_results_age %>%
  rownames_to_column("gene")
gene_universe_age <- dds_results_age$gene

Genes of interest

Upregulated

filtered_genes_age <- dds_results_age %>%
  filter(padj < 0.01 & abs(log2FoldChange) > 2) %>%
  pull(gene)

Downregulated

filtered_genes_age_d <- dds_results_age %>%
  filter(padj < 0.01 & abs(log2FoldChange) > -2) %>%
  pull(gene)

Over-representation analysis

GO_terms_age <- enrichGO(
  gene          = filtered_genes_age,
  universe      = gene_universe_age,
  OrgDb         = "org.Hs.eg.db",
  pAdjustMethod = "BH",
  pvalueCutoff  = 0.01,
  qvalueCutoff  = 0.01,
  readable      = TRUE,
  keyType       = "SYMBOL",
  ont           = "BP"
)

GO_terms_age_table <- as.data.frame(GO_terms_age)
head(GO_terms_age_table, 3)
ID Description GeneRatio BgRatio RichFactor FoldEnrichment zScore pvalue p.adjust qvalue geneID Count
GO:0002449 GO:0002449 lymphocyte mediated immunity 10/30 374/17277 0.0267380 15.39840 11.74069 0 2e-07 2e-07 CD1C/CR2/FCMR/IFNG/IGHD/IGHG1/IGHG3/IGKC/IGLC2/NKG7 10
GO:0002460 GO:0002460 adaptive immune response based on somatic recombination of immune receptors built from immunoglobulin superfamily domains 10/30 386/17277 0.0259067 14.91969 11.53509 0 2e-07 2e-07 CD1C/CR2/CXCL13/FCMR/IFNG/IGHD/IGHG1/IGHG3/IGKC/IGLC2 10
GO:0050853 GO:0050853 B cell receptor signaling pathway 6/30 75/17277 0.0800000 46.07200 16.31449 0 8e-07 6e-07 FCMR/IGHD/IGHG1/IGHG3/IGKC/MS4A1 6
GO_terms_aged <- enrichGO(
  gene          = filtered_genes_age_d,
  universe      = gene_universe_age,
  OrgDb         = "org.Hs.eg.db",
  pAdjustMethod = "BH",
  pvalueCutoff  = 0.01,
  qvalueCutoff  = 0.01,
  readable      = TRUE,
  keyType       = "SYMBOL",
  ont           = "BP"
)

GO_terms_age_table <- as.data.frame(GO_terms_age)
head(GO_terms_age_table, 3)
ID Description GeneRatio BgRatio RichFactor FoldEnrichment zScore pvalue p.adjust qvalue geneID Count
GO:0002449 GO:0002449 lymphocyte mediated immunity 10/30 374/17277 0.0267380 15.39840 11.74069 0 2e-07 2e-07 CD1C/CR2/FCMR/IFNG/IGHD/IGHG1/IGHG3/IGKC/IGLC2/NKG7 10
GO:0002460 GO:0002460 adaptive immune response based on somatic recombination of immune receptors built from immunoglobulin superfamily domains 10/30 386/17277 0.0259067 14.91969 11.53509 0 2e-07 2e-07 CD1C/CR2/CXCL13/FCMR/IFNG/IGHD/IGHG1/IGHG3/IGKC/IGLC2 10
GO:0050853 GO:0050853 B cell receptor signaling pathway 6/30 75/17277 0.0800000 46.07200 16.31449 0 8e-07 6e-07 FCMR/IGHD/IGHG1/IGHG3/IGKC/MS4A1 6
p4 <- dotplot(GO_terms_aged, showCategory = 10, orderBy = "p.adjust")+
  theme(
    axis.text.y = element_text(size = 7),
    axis.text.x = element_text(size = 10),
    panel.grid.major.y = element_blank(),
    plot.title = element_text(hjust = 0.5, face = "bold")
  ) +
  labs(
    title = "GO BP enrichment – AGE_GR (downregulated genes)",
    x = "Gene ratio",
    y = NULL
  )
p3 <- dotplot(GO_terms_age, showCategory = 10, orderBy = "p.adjust")+
  theme(
    axis.text.y = element_text(size = 7),
    axis.text.x = element_text(size = 10),
    panel.grid.major.y = element_blank(),
    plot.title = element_text(hjust = 0.5, face = "bold")
  ) +
  labs(
    title = "GO BP enrichment – AGE_GR (upregulated genes)",
    x = "Gene ratio",
    y = NULL
  )

GENDER

gene_universe_gender <- res_gender$gene

Genes of interest

Downregulated

filtered_genes_gender_down <-res_gender %>%
  filter(padj < 0.01 & log2FoldChange < -2) %>%
  pull(gene)

Upregulated

filtered_genes_gender <- res_gender %>%
  filter(padj < 0.01 & abs(log2FoldChange) > 2) %>%
  pull(gene)

Over-representation analysis

GO_terms_gender <- enrichGO(
  gene          = filtered_genes_gender,
  universe      = gene_universe_gender,
  OrgDb         = "org.Hs.eg.db",
  pAdjustMethod = "BH",
  pvalueCutoff  = 0.01,
  qvalueCutoff  = 0.01,
  readable      = TRUE,
  keyType       = "SYMBOL",
  ont           = "BP"
)

GO_terms_gender_table <- as.data.frame(GO_terms_gender)
head(GO_terms_gender_table, 3)
ID Description GeneRatio BgRatio RichFactor FoldEnrichment zScore pvalue p.adjust qvalue geneID Count
GO:0032482 GO:0032482 Rab protein signal transduction 1/1 24/17277 0.0416667 719.875 26.81184 0.0013891 0.0041674 NA RAB30 1
GO_terms_gender_down <- enrichGO(
  gene          = filtered_genes_gender_down,
  universe      = gene_universe_gender,
  OrgDb         = "org.Hs.eg.db",
  pAdjustMethod = "BH",
  pvalueCutoff  = 0.01,
  qvalueCutoff  = 0.01,
  readable      = TRUE,
  keyType       = "SYMBOL",
  ont           = "BP"
)
--> No gene can be mapped....
--> Expected input gene ID: PPP4C,NBN,KAT5,RPF2,MSH6,H1-4
--> return NULL...
GO_terms_gender_table <- as.data.frame(GO_terms_gender_down)
head(GO_terms_gender_table, 3)

Blank table.

p2 <- dotplot(GO_terms_gender, showCategory = 10)+
  theme(
    axis.text.y = element_text(size = 7),
    axis.text.x = element_text(size = 10),
    panel.grid.major.y = element_blank(),
    plot.title = element_text(hjust = 0.5, face = "bold")
  ) +
  labs(
    title = "GO BP enrichment–GENDER (upregulated genes) ",
    x = "Gene ratio",
    y = NULL
  )

We decided to change p adjust for 0.05 for wider range of data, and to observant new patterns

P.adjust < 0.05

filtered_genes_gender_2 <- res_gender %>%
  filter(padj < 0.05 & abs(log2FoldChange) > 2) %>%
  pull(gene)
GO_terms_gender_2 <- enrichGO(
  gene          = filtered_genes_gender_2,
  universe      = gene_universe_gender,
  OrgDb         = "org.Hs.eg.db",
  pAdjustMethod = "BH",
  pvalueCutoff  = 0.05,
  qvalueCutoff  = 0.05,
  readable      = TRUE,
  keyType       = "SYMBOL",
  ont           = "BP"
)

GO_terms_gender_table <- as.data.frame(GO_terms_gender)
head(GO_terms_gender_table, 3)
ID Description GeneRatio BgRatio RichFactor FoldEnrichment zScore pvalue p.adjust qvalue geneID Count
GO:0032482 GO:0032482 Rab protein signal transduction 1/1 24/17277 0.0416667 719.875 26.81184 0.0013891 0.0041674 NA RAB30 1
p1 <- dotplot(GO_terms_gender_2, showCategory = 10) +
  theme(
    axis.text.y = element_text(size = 7),
    axis.text.x = element_text(size = 10),
    panel.grid.major.y = element_blank(),
    plot.title = element_text(hjust = 0.5, face = "bold")
  ) +
  labs(
    title = "GO BP enrichment – GENDER (upregulated genes)",
    x = "Gene ratio",
    y = NULL
  )

Plots

library(cowplot)

plot_grid(p1, p2, ncol = 2)

plot_grid(p3, p4, ncol = 2)

plot_grid(p5)